library(tidyverse) # for general analysis
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(fpp)
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: tseries
library(fpp3) # for predictions 
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.3 ──
## ✓ lubridate   1.7.4     ✓ feasts      0.1.5
## ✓ tsibble     0.9.3     ✓ fable       0.2.1
## ✓ tsibbledata 0.2.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x lubridate::date()       masks base::date()
## x dplyr::filter()         masks stats::filter()
## x fabletools::forecast()  masks forecast::forecast()
## x tsibble::index()        masks zoo::index()
## x tsibble::interval()     masks lubridate::interval()
## x dplyr::lag()            masks stats::lag()
## x tsibble::new_interval() masks lubridate::new_interval()
library(ggthemes) # for beautiful themes
library(TSA) # for TS analysis
## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast
## 
## Attaching package: 'TSA'
## The following object is masked from 'package:readr':
## 
##     spec
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(kableExtra) # for beautiful tables
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(lubridate) # for time data
library(tsibble) # for tsibble data format
library(caret) # for modeling
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
library(stats)
library(ggplot2)
library(MuMIn)
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## The following object is masked from 'package:fabletools':
## 
##     accuracy
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(tstools)


# Set a global ggplot2 theme for all subsequent plots
theme_set(theme_minimal())

Exploratory Data Analysis

Data is imported and transformed into a time series object.

sales_month <- read_csv("Zillow monthly raw data (full).csv") # import raw Zillow monthly median-sale-price data
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   RegionName = col_character(),
##   RegionType = col_character(),
##   StateName = col_character()
## )
## See spec(...) for full column specifications.
chicago_sales_m <- sales_month %>% filter(RegionName == "Chicago, IL") # keep the Chicago metro rows only

chicago_sales_m <- chicago_sales_m %>% # reshape the wide month columns into Date/price rows
                 gather(key = "Date", value = "Price", -RegionID:-StateName) %>% # NOTE(review): gather() is superseded; pivot_longer() is the modern equivalent
                 select(Date, Price) %>%
                 rename(Median_price = Price)

# parse the month-column headers (month/day/year strings) into Date objects
chicago_sales_m$Date <- mdy(chicago_sales_m$Date)

dim(chicago_sales_m) # 151 monthly observations, 2 columns
## [1] 151   2
# monthly time series starting February 2008
chi_median <- ts(chicago_sales_m$Median_price, start=c(2008,2), frequency=12)

Data is complete, and in appropriate form.

sum(is.na(chi_median)) # check for missing values (0 expected)
## [1] 0
frequency(chi_median) # confirm monthly frequency (12)
## [1] 12
cycle(chi_median) # confirm each observation maps to the expected month of the year
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008       2   3   4   5   6   7   8   9  10  11  12
## 2009   1   2   3   4   5   6   7   8   9  10  11  12
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1   2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3   4   5   6   7   8   9  10  11  12
## 2019   1   2   3   4   5   6   7   8   9  10  11  12
## 2020   1   2   3   4   5   6   7   8

There are two distinctive trends - a negative trend during and in the wake of the 2008 financial crisis (2008-2013), followed by a positive trend until the present. There is also clear additive seasonality. However, this seasonal trend has been impacted by the current COVID-19 pandemic.

autoplot(chi_median, main="Chicago: Median Home Sale Price") # level plot of the full series

min(chi_median) # $149,000, which corresponds to February 2013
## [1] 149000
max(chi_median) # $270,000, which corresponds to August 2020
## [1] 270000

Decomposition of the time series validates the initial impression of the data. However, there appears to be more than one seasonal pattern, evidenced by the “seasonal” and “random” decompositions.

plot(decompose(chi_median)) # classical additive decomposition: observed / trend / seasonal / random

Median price peaks during the summer months. The range between median prices is also narrower in the spring/summer months than in the rest of the year.

boxplot(chi_median~cycle(chi_median), xlab="Cycle (Year)", ylab="Dollars", main="Chicago: Median Home Sales Price") # distribution of median price by calendar month

Both the ACF and PACF decay slowly and sinusoidally over time, indicating seasonality and a significant linear relationship between the series and its lags.

acf(chi_median, main="ACF Chicago Median Home Sales", lag.max=50) # autocorrelation out to 50 lags

pacf(chi_median, main="PACF Chicago Median Home Sales", lag.max=50) # partial autocorrelation out to 50 lags

Looking at the Raw Periodogram, we can see that there are many spikes and therefore many dominating frequencies. The periodogram has two major spikes, which contribute most to the overall signal.

spectrum(chi_median) # spectral density estimate via stats::spectrum

periodogram(chi_median) # TSA periodogram of the raw series

Create train/test split for modelling.

# train ends at time 2019.6; August 2019 = 2019 + 7/12 ~ 2019.583, so the last
# training observation is Aug 2019 (fractional-year endpoints are fragile —
# window(end = c(2019, 8)) would be clearer)
chi_train<-window(chi_median, end = 2019.6) #end train in August 2019
# test starts at time 2019.6, so its first observation is September 2019
chi_test<-window(chi_median, start = 2019.6) #use last year of dataset as test

# double-check time series structure of the training split
frequency(chi_train) 
## [1] 12
cycle(chi_train) 
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008       2   3   4   5   6   7   8   9  10  11  12
## 2009   1   2   3   4   5   6   7   8   9  10  11  12
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1   2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3   4   5   6   7   8   9  10  11  12
## 2019   1   2   3   4   5   6   7   8
frequency(chi_test) # test split keeps monthly frequency
## [1] 12
cycle(chi_test) # test split runs September 2019 through August 2020
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2019                                   9  10  11  12
## 2020   1   2   3   4   5   6   7   8

Holt-Winters Seasonal Method

The Holt-Winters Seasonal Method was selected based on its appropriateness for time series with both a trend and seasonality. Two additive HW models were created - one with and one without damping.

The first model, which does not include damping, does not completely capture the autocorrelation in the time series. There remains significant autocorrelation in the residuals, and the residuals are not normally distributed.

# Holt-Winters additive model, no damping, 12-month forecast horizon
HW1 <- hw(chi_train, seasonal = "additive",h=12, damped = FALSE) 
plot(HW1)

checkresiduals(HW1) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from Holt-Winters' additive method
## Q* = 70.68, df = 8, p-value = 3.598e-12
## 
## Model df: 16.   Total lags used: 24

Adding damping does not improve the model. The residuals do not resemble white noise.

# Holt-Winters additive model with a damped trend, 12-month forecast horizon
HW2 <- hw(chi_train, seasonal = "additive",h=12, damped = TRUE) 
plot(HW2)

checkresiduals(HW2) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from Damped Holt-Winters' additive method
## Q* = 68.899, df = 7, p-value = 2.466e-12
## 
## Model df: 17.   Total lags used: 24

sArima

From the EDA, we know that there is significant seasonality in the data, suggesting that a sArima model might be appropriate. We begin exploring ways to make the data stationary.

We first check whether a Box-Cox Transformation is necessary, and find that lambda should be set to 1.999 (rounded to 2 for simplicity). After the Box-Cox Transformation, a KPSS Test for stationarity is performed. The series is not stationary.

BoxCox.lambda(chi_train) # lambda = 1.999924, significantly different from 1
## [1] 1.999924
lambda <- 2 # round the estimated lambda up to 2 for simplicity
transformed <- BoxCox(chi_train, lambda=lambda) # Box-Cox transformation of the training series
kpss.test(transformed) # p=0.0141 < 0.05: transformed data is still not stationary
## 
##  KPSS Test for Level Stationarity
## 
## data:  transformed
## KPSS Level = 0.69386, Truncation lag parameter = 4, p-value = 0.0141

The time series becomes stationary after 1st order differencing.

differenced <- diff(transformed) # first-order difference of the transformed data
kpss.test(differenced) # KPSS p=0.1 > 0.05: stationary after Box-Cox transformation + 1 round of differencing
## Warning in kpss.test(differenced): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  differenced
## KPSS Level = 0.13632, Truncation lag parameter = 4, p-value = 0.1
tsdisplay(differenced, lag=50) # plot series, ACF, and PACF of the differenced data

periodogram(differenced) # NOTE(review): unclear whether the periodogram should be read on raw or differenced data — confirm

A sArima model is built using auto.arima, with differencing=1 and lambda=2. This results in an ARIMA(2,1,2)(0,1,2)[12] model. The Ljung-Box test shows that there is not significant autocorrelation remaining in the residuals - they resemble white noise.

# let auto.arima search orders with first differencing forced and lambda fixed at 2
AR1 <-  auto.arima(chi_train, d=1, lambda=lambda)
summary(AR1) # selected model: ARIMA(2,1,2)(0,1,2)[12]
## Series: chi_train 
## ARIMA(2,1,2)(0,1,2)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1      ar2      ma1     ma2     sma1     sma2
##       -0.1436  -0.3785  -0.1495  0.6499  -0.2587  -0.2835
## s.e.   0.2826   0.1670   0.2313  0.1251   0.1151   0.1079
## 
## sigma^2 estimated as 9.664e+17:  log likelihood=-2786.77
## AIC=5587.54   AICc=5588.49   BIC=5607.39
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 910.0087 4697.268 3476.997 0.4844208 1.755815 0.2767997 -0.0248055
checkresiduals(AR1) # Ljung-Box p=0.17: residuals resemble white noise

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)(0,1,2)[12]
## Q* = 23.441, df = 18, p-value = 0.1742
## 
## Model df: 6.   Total lags used: 24
plot(forecast(AR1)) # default-horizon forecast from the selected model

Further exploration is necessary to see if a superior sArima model can be built.

eacf(chi_train) # extended ACF table to suggest candidate (p, q) orders
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x  x  x  x 
## 1 x x o o x x x o o x x  x  x  x 
## 2 x o x o o o x o o o o  x  o  o 
## 3 x o o o o o o o o o o  x  o  o 
## 4 o x x o o o o o o o o  x  o  o 
## 5 x o o x o o o o o o o  x  x  x 
## 6 x x o x x o o o o o o  x  x  o 
## 7 x x o x o o o o o o o  x  x  x

We experiment with different combinations of p=1/2, q=1/2, and P=0/1. Of the models tested, only AR2 (Arima(2,1,2)(1,1,1)[12]) passes the Ljung-Box test (assuming 0.05 threshold).

AR2 <-  Arima(chi_train, order=c(2,1,2), seasonal=c(1,1,1),lambda=lambda) # Ljung-Box p=0.09: passes at the 0.05 level
summary(AR2)
## Series: chi_train 
## ARIMA(2,1,2)(1,1,1)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1      ar2      ma1     ma2    sar1     sma1
##       -0.1324  -0.3694  -0.1473  0.6326  0.3773  -0.7245
## s.e.   0.3064   0.1669   0.2496  0.1421  0.1927   0.1599
## 
## sigma^2 estimated as 9.937e+17:  log likelihood=-2788.21
## AIC=5590.42   AICc=5591.37   BIC=5610.28
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 912.5512 4732.661 3506.983 0.4828152 1.769271 0.2791869
##                     ACF1
## Training set -0.02600148
checkresiduals(AR2) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)(1,1,1)[12]
## Q* = 26.573, df = 18, p-value = 0.08736
## 
## Model df: 6.   Total lags used: 24
AR3 <-  Arima(chi_train, order=c(1,1,1), seasonal=c(1,1,1),lambda=lambda) # Ljung-Box p=0.005: fails
summary(AR3)
## Series: chi_train 
## ARIMA(1,1,1)(1,1,1)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1     ma1   sar1     sma1
##       -0.4016  0.0957  0.320  -0.6747
## s.e.   0.1858  0.1938  0.204   0.1732
## 
## sigma^2 estimated as 1.052e+18:  log likelihood=-2792.51
## AIC=5595.01   AICc=5595.51   BIC=5609.19
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 1060.193 4891.313 3731.734 0.5555307 1.88292 0.2970791
##                      ACF1
## Training set -0.006238075
checkresiduals(AR3) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 40.2, df = 20, p-value = 0.004712
## 
## Model df: 4.   Total lags used: 24
AR4 <-  Arima(chi_train, order=c(1,1,2), seasonal=c(0,1,1),lambda=lambda) # Ljung-Box p=0.01: fails
summary(AR4)
## Series: chi_train 
## ARIMA(1,1,2)(0,1,1)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1      ma1     ma2     sma1
##       -0.0480  -0.2462  0.2981  -0.4112
## s.e.   0.2072   0.1836  0.1230   0.1209
## 
## sigma^2 estimated as 1.044e+18:  log likelihood=-2791.84
## AIC=5593.69   AICc=5594.19   BIC=5607.87
## 
## Training set error measures:
##                    ME   RMSE      MAE       MPE     MAPE      MASE         ACF1
## Training set 785.2016 4810.3 3652.513 0.4181308 1.837683 0.2907724 -0.009892509
checkresiduals(AR4) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)(0,1,1)[12]
## Q* = 37.689, df = 20, p-value = 0.009664
## 
## Model df: 4.   Total lags used: 24
AR5 <-  Arima(chi_train, order=c(2,1,1), seasonal=c(0,1,1),lambda=lambda) # Ljung-Box p=0.002: fails
summary(AR5)
## Series: chi_train 
## ARIMA(2,1,1)(0,1,1)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##          ar1     ar2      ma1     sma1
##       0.0366  0.1842  -0.3266  -0.4154
## s.e.  0.4093  0.1456   0.4079   0.1186
## 
## sigma^2 estimated as 1.071e+18:  log likelihood=-2793.38
## AIC=5596.77   AICc=5597.27   BIC=5610.95
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 880.3428 4880.438 3743.138 0.4651391 1.882288 0.2979869
##                     ACF1
## Training set -0.02164737
checkresiduals(AR5) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,1)(0,1,1)[12]
## Q* = 43.317, df = 20, p-value = 0.001856
## 
## Model df: 4.   Total lags used: 24

Next, we compare the AICc and BIC scores of the two models that passed the Ljung-Box Test. The model selected by auto.arima has lower AICc and BIC values, showing that it is the better choice.

# Compare AICc and BIC for the two models that passed the Ljung-Box test.
# Fixed: the original read AR2$bi, which only worked via `$` partial matching
# against the `bic` element — a silent footgun; spell the field out in full.
# Also renamed the label vector so it no longer shadows base::names.
AR_AB <- c(AR1$aicc, AR2$aicc, AR1$bic, AR2$bic)
score_labels <- c("AR1: AICc", "AR2: AICc", "AR1: BIC", "AR2: BIC")

barplot(AR_AB, names = score_labels, ylim = c(5585, 5615),
        main = "Arima Models", col = 'azure')

Next, we use the ARIMA(2,1,2)(0,1,2)[12] model to forecast results for the next year. The “Actual vs. Predicted” chart shows that the model does a decent job of forecasting values pre-COVID, but is not capable of accurately predicting the trend for the past few months.

# Model 1: forecast 12 months ahead with the auto.arima model
AR1_fcast <- forecast(AR1, h=12)
plot(AR1_fcast, chi_test)

tsplot(cbind(chi_test, AR1_fcast$mean), plot_title="Actual vs. Predicted") # overlay actuals and point forecasts

MAE(AR1_fcast$mean, chi_test) # mean absolute error over the 12-month test window
## [1] 4971.849
RMSE(AR1_fcast$mean, chi_test) # root mean squared error over the 12-month test window
## [1] 7592.068

Next we do cross-validation to understand how the model’s performance evolves over time.

# Rolling-window cross-validation of the seasonal ARIMA model.
# NOTE(review): auto.arima selected ARIMA(2,1,2)(0,1,2)[12] for AR1 earlier,
# but this loop refits with seasonal order (0,1,1) — confirm that is intended.
k <- 72 # minimum observations per training window (6 years)
n <- length(chi_median) # number of data points
p <- 12 # period (monthly data)
H <- 12 # forecast horizon

# Silence fitting/window warnings for the duration of the loop only;
# the saved level is restored immediately after the loop (the original
# saved defaultW but never restored it, leaking warn = -1 globally).
defaultW <- getOption("warn")
options(warn = -1)

st <- tsp(chi_median)[1] + (k - 2) / p # start time of the first window, in time units

error_AR1 <- matrix(NA, n - k, H) # forecast errors: one row per window, one column per horizon step
AICc_AR1 <- matrix(NA, n - k) # AICc of the model fitted in each window

for (i in seq_len(n - k)) {
  # rolling window: k observations of training data, up to H of test data
  train <- window(chi_median, start = st + (i - k + 1) / p, end = st + i / p)
  test <- window(chi_median, start = st + (i + 1) / p, end = st + (i + H) / p)

  # refit the candidate seasonal ARIMA on this window
  AR12 <- Arima(train, order = c(2, 1, 2),
                seasonal = list(order = c(0, 1, 1), period = p),
                lambda = 'auto', method = 'ML') # ARIMA(2,1,2)(0,1,1)[12]

  fcast_AR1 <- forecast(AR12, h = H)

  # store errors (rows near the end of the series have fewer than H test
  # points, leaving trailing NAs) and the window's AICc
  error_AR1[i, seq_along(test)] <- fcast_AR1[['mean']] - test
  AICc_AR1[i] <- AR12$aicc
}

options(warn = defaultW) # restore the warning level saved above

MAE_AR1 <- colMeans(abs(error_AR1), na.rm = TRUE)
RMSE_AR1 <- sqrt(colMeans(error_AR1^2, na.rm = TRUE))

# error by forecast horizon (1..H months ahead)
plot(seq_len(H), MAE_AR1, type = "l", col = 3, xlab = "Horizon", ylab = "Error",
     ylim = c(3600, 8000))
lines(seq_len(H), RMSE_AR1, type = "l", col = 2)
legend("topleft", legend = c("AR1 - MAE", "AR1 - RMSE"), col = 1:4, lty = 1)

# AICc over the n - k rolling windows (79 for this series; was hard-coded 1:79)
plot(seq_len(n - k), AICc_AR1, type = "l", col = 1, xlab = "Iterations",
     ylab = "AICc", ylim = c(750, 2650))
legend("bottomright", legend = "AR1 - AICc", col = 1:4, lty = 1)

Regression with Arima errors

As previously mentioned, the sArima model was not able to capture the changed environment due to COVID. Therefore, we will try Regression with Arima errors, in order to include other leading predictors.

We tried Google Trends data for two search terms: “homes for sale” and “realtor”.

First, we import the datasets and double-check dimensions.

homes <- read_csv("home for sale.csv") # Google Trends data for the search term "home for sale"
## Parsed with column specification:
## cols(
##   Month = col_character(),
##   `Home for sale` = col_double()
## )
realtor <- read_csv("realtor.csv") # Google Trends data for the search term "realtor"
## Parsed with column specification:
## cols(
##   Month = col_character(),
##   Realtor = col_double()
## )
# parse the Month strings into yearmonth values
homes$Month <- yearmonth(homes$Month) 
realtor$Month <- yearmonth(realtor$Month) 

dim(homes) # 151 rows, matching the sales series
## [1] 151   2
dim(realtor) # 151 rows, matching the sales series
## [1] 151   2
sum(is.na(homes)) # no missing values
## [1] 0
sum(is.na(realtor)) # no missing values
## [1] 0

Store as TS object

# monthly time series for both predictors, aligned with chi_median (start Feb 2008)
homes_ts <- ts(homes$`Home for sale`, frequency=12, start=c(2008,2))
realtor_ts <- ts(realtor$Realtor, frequency=12, start=c(2008,2))

Visualize the median sale price (scaled) dataset and the two Google Trends datasets. They show similar seasonality, suggesting that they might be appropriate predictors.

chi_median_scale <- chi_median/1000 # scale prices to thousands so all three series share one axis
tsplot(cbind(chi_median_scale, homes_ts, realtor_ts), plot_title="Trends")

Train/test split

# same split points as the sales series: train through Aug 2019, test from Sep 2019
home_train<-window(homes_ts, end = 2019.6) #end train in August 2019
home_test<-window(homes_ts, start = 2019.6) #use last year of dataset as test

realtor_train<-window(realtor_ts, end = 2019.6) #end train in August 2019
realtor_test<-window(realtor_ts, start = 2019.6) #use last year of dataset as test

Transform realtor data with a lambda value of 0.3 prior to fitting the regression model.

lambda_realtor <- BoxCox.lambda(realtor_train) # lambda = 0.3 (approx.), significantly different from 1
transformed_realtor <- BoxCox(realtor_train, lambda=lambda_realtor) # Box-Cox transformation of the predictor

The regression model using the transformed_realtor dataset does not pass the Ljung-Box test — significant autocorrelation remains in the residuals.

# regression on the transformed "realtor" series with ARIMA errors, chosen by auto.arima
RAR1 <- auto.arima(y=chi_train, lambda=lambda, xreg=transformed_realtor) #auto.arima model 
summary(RAR1) # model summary
## Series: chi_train 
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##            xreg
##       276301056
## s.e.  146286849
## 
## sigma^2 estimated as 1.354e+18:  log likelihood=-2808.52
## AIC=5621.04   AICc=5621.14   BIC=5626.71
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 524.6034 5691.497 4194.314 0.2686517 2.123857 0.3339045 -0.3945342
checkresiduals(RAR1) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 80.737, df = 23, p-value = 2.412e-08
## 
## Model df: 1.   Total lags used: 24

The “home for sale” dataset did not need to be transformed.

BoxCox.lambda(home_train) # lambda = 0.92, close to 1, so no transformation applied
## [1] 0.9184948

Similar to the other regression model, there remains significant autocorrelation in the residuals for this model as well.

# regression on the untransformed "home for sale" series with ARIMA errors
RAR2 <- auto.arima(y=chi_train, lambda=lambda, xreg=home_train) #auto.arima model 
summary(RAR2) # model summary
## Series: chi_train 
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           xreg
##       18125128
## s.e.  10579886
## 
## sigma^2 estimated as 1.334e+18:  log likelihood=-2807.56
## AIC=5619.12   AICc=5619.22   BIC=5624.79
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 527.7724 5658.816 4153.264 0.2705263 2.107745 0.3306366 -0.3652482
checkresiduals(RAR2) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 73.9, df = 23, p-value = 2.996e-07
## 
## Model df: 1.   Total lags used: 24

Save as TS object, do train/test split

# combine both predictors into one multivariate series; the second cbind keeps
# only the two numeric predictor columns, dropping Month.
# NOTE(review): the literal column name 'realtor$Realtor' is awkward and shows
# up verbatim in the model output — consider renaming the columns first.
gtrends <- cbind(homes, realtor$Realtor)
gtrends_ts <- ts(gtrends, frequency=12, start=c(2008,2))
gtrends_ts <- cbind(gtrends_ts[,'Home for sale'], gtrends_ts[,'realtor$Realtor'])

gtrends_train<-window(gtrends_ts, end = 2019.6) #end train in August 2019
gtrends_test<-window(gtrends_ts, start = 2019.6) #use last year of dataset as test

As with other regression models, there remains significant autocorrelation in the residuals. Will experiment with other predictors later time permitting.

# regression on both Google Trends predictors with ARIMA errors
RAR3 <- auto.arima(y=chi_train, lambda=lambda, xreg=gtrends_train) #auto.arima model 
summary(RAR3) # model summary
## Series: chi_train 
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##       gtrends_ts[, "Home for sale"]  gtrends_ts[, "realtor$Realtor"]
##                            17242596                          7887560
## s.e.                       10260298                         13730589
## 
## sigma^2 estimated as 1.342e+18:  log likelihood=-2807.41
## AIC=5620.83   AICc=5621.03   BIC=5629.34
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE       ACF1
## Training set 523.5547 5648.847 4145.484 0.2681766 2.10269 0.3300172 -0.3662402
checkresiduals(RAR3) # residual diagnostics + Ljung-Box test

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 74.904, df = 22, p-value = 1.095e-07
## 
## Model df: 2.   Total lags used: 24

Dynamic Harmonic Regression

Recall that the periodogram points to two major spikes.

temp <- periodogram(chi_train) # periodogram of the training series

spec_p <- matrix(round(temp$spec/1e5, 2)) # spectrum values, scaled by 1e5
freq_p <- matrix(temp$freq) # corresponding frequencies
cbind(spec_p, freq_p) # two major spikes, at 0.00694 and 0.08333 (see output below; the original comment's 0.00625/0.08125 mis-read the table)
##            [,1]        [,2]
##  [1,] 572397.76 0.006944444
##  [2,]  40650.63 0.013888889
##  [3,]   2355.93 0.020833333
##  [4,]   7906.67 0.027777778
##  [5,]   8553.56 0.034722222
##  [6,]   5616.07 0.041666667
##  [7,]   6845.62 0.048611111
##  [8,]   1028.18 0.055555556
##  [9,]   4165.32 0.062500000
## [10,]   5444.90 0.069444444
## [11,]   2769.42 0.076388889
## [12,] 217995.33 0.083333333
## [13,]  20085.74 0.090277778
## [14,]    942.00 0.097222222
## [15,]   1369.46 0.104166667
## [16,]    405.27 0.111111111
## [17,]   2350.98 0.118055556
## [18,]   1266.73 0.125000000
## [19,]    455.99 0.131944444
## [20,]   1762.04 0.138888889
## [21,]    582.42 0.145833333
## [22,]   1400.40 0.152777778
## [23,]    186.61 0.159722222
## [24,]   6555.05 0.166666667
## [25,]   3283.36 0.173611111
## [26,]    102.17 0.180555556
## [27,]    113.86 0.187500000
## [28,]    393.28 0.194444444
## [29,]     23.64 0.201388889
## [30,]     41.94 0.208333333
## [31,]     13.99 0.215277778
## [32,]    136.43 0.222222222
## [33,]    485.21 0.229166667
## [34,]    306.49 0.236111111
## [35,]    228.94 0.243055556
## [36,]   4207.25 0.250000000
## [37,]    537.30 0.256944444
## [38,]    386.98 0.263888889
## [39,]    211.18 0.270833333
## [40,]    179.81 0.277777778
## [41,]    436.47 0.284722222
## [42,]    408.08 0.291666667
## [43,]    361.11 0.298611111
## [44,]    534.24 0.305555556
## [45,]    496.54 0.312500000
## [46,]    646.79 0.319444444
## [47,]    617.71 0.326388889
## [48,]   1392.31 0.333333333
## [49,]    235.67 0.340277778
## [50,]    280.66 0.347222222
## [51,]    832.83 0.354166667
## [52,]    844.08 0.361111111
## [53,]    140.30 0.368055556
## [54,]     25.36 0.375000000
## [55,]    370.71 0.381944444
## [56,]     12.25 0.388888889
## [57,]    215.70 0.395833333
## [58,]    366.16 0.402777778
## [59,]    258.74 0.409722222
## [60,]    624.84 0.416666667
## [61,]    155.58 0.423611111
## [62,]    100.53 0.430555556
## [63,]    409.91 0.437500000
## [64,]    155.03 0.444444444
## [65,]    290.05 0.451388889
## [66,]     11.59 0.458333333
## [67,]    295.37 0.465277778
## [68,]    739.64 0.472222222
## [69,]    349.04 0.479166667
## [70,]     20.73 0.486111111
## [71,]    290.61 0.493055556
## [72,]   1659.42 0.500000000
(1/0.00625)/12 # NOTE(review): the periodogram peak is at 0.00694, not 0.00625; 1/0.00694/12 = 12 years, i.e. the long-run trend component
## [1] 13.33333
(1/0.08125)/12 # NOTE(review): peak is at 0.08333, not 0.08125; 1/0.08333/12 = exactly 1 year, i.e. annual seasonality
## [1] 1.025641

We create a regression model using a Fourier transformation (K=2) as the predictor, and Arima errors. This results in an ARIMA(0,2,3)(1,0,0)[12] model. The residuals show no strong remaining autocorrelation (Ljung-Box p = 0.06, borderline) and are relatively normally distributed, so they approximately resemble white noise.

# Fourier terms (K=2) as external regressors, ARIMA errors chosen by auto.arima.
# NOTE(review): dynamic-harmonic-regression examples typically set seasonal=FALSE
# so seasonality is carried by the Fourier terms alone; here auto.arima was also
# allowed a seasonal ARIMA component — confirm that is intended.
DHR1 <-  auto.arima(chi_train, xreg=fourier(chi_train, K=2))
summary(DHR1)
## Series: chi_train 
## Regression with ARIMA(0,2,3)(1,0,0)[12] errors 
## 
## Coefficients:
##           ma1     ma2      ma3    sar1     S1-12       C1-12       S2-12
##       -1.2948  0.5287  -0.2133  0.6168  1969.158  -15229.860  -2614.3153
## s.e.   0.0831  0.1542   0.1143  0.0702  2140.429    2149.604    980.9099
##          C2-12
##       1346.945
## s.e.   977.806
## 
## sigma^2 estimated as 24592403:  log likelihood=-1359.98
## AIC=2737.96   AICc=2739.38   BIC=2764.24
## 
## Training set error measures:
##                    ME    RMSE      MAE       MPE    MAPE      MASE        ACF1
## Training set 629.7414 4777.36 3773.858 0.2944363 1.88085 0.3004325 -0.02342825
checkresiduals(DHR1) # Ljung-Box p=0.06: borderline, no strong evidence of remaining autocorrelation

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,2,3)(1,0,0)[12] errors
## Q* = 25.555, df = 16, p-value = 0.06062
## 
## Model df: 8.   Total lags used: 24

This model is less accurate than the sArima model used for prediction earlier, although both models struggle with the same thing: predicting values during the COVID period.

# Model 2: 12-month forecast; the same Fourier terms must be supplied as future regressors

DHR1_fcast <- forecast(DHR1, h=12, xreg=fourier(chi_train, K=2, h=12))
plot(DHR1_fcast, chi_test)

tsplot(cbind(chi_test, DHR1_fcast$mean), plot_title="Actual vs. Predicted") # overlay actuals and point forecasts

MAE(DHR1_fcast$mean, chi_test) # mean absolute error over the 12-month test window
## [1] 5604.526
RMSE(DHR1_fcast$mean, chi_test) # root mean squared error over the 12-month test window
## [1] 8589.457

Intervention

Will come back to this

# Placeholder for a future intervention-analysis model (TSA::arimax) using a
# ramp regressor to capture the COVID-era level shift. Kept for reference:
#arimax(y,order=c(1,0,1), xreg=data.frame(x=c(rep(0,200),1:200)))
#xreg=data.frame(x=c(rep(0,200),1:200)))